Code
import polars as pl
import altair as alt
from read_parquet_and_reorder import read_parquet_and_reorder
from one_hot_encode import one_hot_encode
from blog import logger

# Lift Altair's default row limit so charts can be built from the full dataset.
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
Code
# Let INFO-level records through so the shape/key messages logged by the
# following cells are emitted.
logger.setLevel("INFO")
Code
# Load the product table and report its (rows, columns) shape.
df = read_parquet_and_reorder("df.parquet")
logger.info(df.shape)

# Split the columns in two: the "*_100g" columns (kept together with the
# "code" key) and everything else.
per_100g_names = [name for name in df.columns if name.endswith("_100g")]
other_names = [name for name in df.columns if not name.endswith("_100g")]
df_per_100g = df.select(["code", *per_100g_names])
df = df.select(other_names)

# Tag-like columns that are one-hot encoded individually.
columns = [
    "categories_en",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "traces_en",
    "food_groups_en",
    "nutrient_levels_tags",
    "main_category_en",
    "packaging_en",
]

# One feature frame per encoded column, plus the per-100g frame under
# "nutrients" as the last entry.
df_dict: dict[str, pl.DataFrame] = {
    name: one_hot_encode(df, name, n=10, remove_prefix=["en:", "de:"])
    for name in columns
}
df_dict["nutrients"] = df_per_100g
[06/30/23 09:12:04] INFO     (73307, 175)                                                           2632950586.py:2
[06/30/23 09:12:05] INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
Code
# Assemble the model input table: start from the product codes and attach
# every feature frame in df_dict, matching rows on "code".
df_for_ml = df.select("code")
for key, _df in df_dict.items():
    logger.info(key)
    # Left join: a product whose code is absent from one feature frame keeps
    # its row (with nulls) instead of being silently dropped. The target
    # vector is read from `df` itself, so `df_for_ml` must keep one row per
    # row of `df` (assumes "code" is unique in each feature frame - TODO confirm).
    # Clashing column names from the right-hand frame get `key` appended.
    df_for_ml = df_for_ml.join(_df, on="code", how="left", suffix=key)
# Any remaining nulls (missing values or unmatched codes) become 0.0.
df_for_ml = df_for_ml.fill_null(0.0)
                    INFO     categories_en                                                           288844683.py:3
                    INFO     ingredients_tags                                                        288844683.py:3
                    INFO     ingredients_analysis_tags                                               288844683.py:3
                    INFO     traces_en                                                               288844683.py:3
                    INFO     food_groups_en                                                          288844683.py:3
                    INFO     nutrient_levels_tags                                                    288844683.py:3
                    INFO     main_category_en                                                        288844683.py:3
                    INFO     packaging_en                                                            288844683.py:3
                    INFO     nutrients                                                               288844683.py:3
Code
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import collections
import numpy as np


# Target: the "nutriscore_score" column flattened to a 1-D array.
score_column = df.select("nutriscore_score")
y = score_column.to_numpy().flatten()

# Features: every assembled column except the "code" join key.
feature_frame = df_for_ml.drop("code")
X = feature_frame.to_numpy()

# Report both shapes (target first, then features).
logger.info(y.shape)
logger.info(X.shape)
[06/30/23 09:12:30] INFO     (73307,)                                                               2128009356.py:9
                    INFO     (73307, 198)                                                          2128009356.py:12
Code
# Rescale every sample (row) of X with scikit-learn's Normalizer, then hold
# out 20 % of the rows for testing. The split is seeded for reproducibility.
transformer = Normalizer().fit(X)
X_train, X_test, y_train, y_test = train_test_split(
    transformer.transform(X), y, test_size=0.20, random_state=2023
)

# Fit an unconstrained regression tree. The estimator is seeded with the same
# value as the split: without a fixed random_state, ties between equally good
# splits are broken at random and the fitted tree can differ between runs.
clf = tree.DecisionTreeRegressor(random_state=2023)
clf = clf.fit(X_train, y_train)

# Collect actual vs. predicted scores for both partitions in one long frame
# ("label" marks the partition); "err" is predicted minus actual.
df_tree = pl.concat(
    [
        pl.DataFrame(
            {
                "actual score": y_test,
                "predicted score": clf.predict(X_test),
                "label": "test",
            }
        ),
        pl.DataFrame(
            {
                "actual score": y_train,
                "predicted score": clf.predict(X_train),
                "label": "train",
            }
        ),
    ]
).with_columns(err=pl.col("predicted score") - pl.col("actual score"))
Code
# Heatmap of predicted vs. actual score, one panel per partition label;
# cell colour encodes the number of rows (reversed viridis scale).
count_colour = alt.Color("count():Q").scale(scheme="viridis", reverse=True)
score_heatmap = (
    alt.Chart(df_tree)
    .mark_rect()
    .encode(
        x=alt.X("actual score:N"),
        y=alt.Y("predicted score:N"),
        color=count_colour,
        column=alt.Column("label:N"),
    )
)
score_heatmap.properties(width=500, height=500)